dplyr (e.g.,
%>%, mutate and select) and
tidyr (gather). If you don’t understand what’s
going on in a “chain” (operations linked by %>%), try to
run the chain sequentially (first only line 1, then lines 1 and 2, then
lines 1-3, etc.) and see what happens.packages <- c("plyr", "tidyr", "broom", "modelr", "glmnet", "selectiveInference", "MASS", "tidyverse", "boot")
for (p in packages)
library(p, character.only = TRUE)
# Document-wide setup: center all knitr figures and use a minimal ggplot2
# theme with slightly enlarged axis and facet-strip text.
knitr::opts_chunk$set(fig.align = "center")
theme_set(theme_minimal()+
theme(axis.title = element_text(size = 11),
axis.text = element_text(size = 10),
strip.text = element_text(size = 11)))
# Little helper to get the glmnet coefficients in nice tidy format.
#
# coefs: the sparse coefficient matrix returned by coef(fit_object, s = [value]).
# Returns a tibble with columns `predictor` and `coefficient`, keeping only
# the non-zero coefficients (i.e. the predictors the lasso selected), sorted
# by absolute magnitude so the most influential predictors come first.
pretty_coefs <- function(coefs) {
  enframe(coefs[, 1], "predictor", "coefficient") %>%
    filter(coefficient != 0) %>%
    arrange(desc(abs(coefficient)))
}

data(fev, package = "mplot")
head(fev)ggplot(fev, aes(x = age, y = fev)) +
geom_jitter(height = 0, width = 0.25, size = 0.5, alpha = 0.25) geom_smooth(se = FALSE) ## geom_smooth: na.rm = FALSE, orientation = NA, se = FALSE
## stat_smooth: na.rm = FALSE, orientation = NA, se = FALSE
## position_identity
geom_smooth(se = FALSE, method = "lm", colour = "black")## geom_smooth: na.rm = FALSE, orientation = NA, se = FALSE
## stat_smooth: na.rm = FALSE, orientation = NA, se = FALSE, method = lm
## position_identity
linmod <- lm(fev ~ age + height + I(height^2) + sex + smoke, data = fev)
tidy(linmod)titanic <- as.data.frame(Titanic) %>%
uncount(Freq)
logreg <- glm(Survived ~ Class + Sex + Age, data = titanic, family = binomial(link = "logit"))
tidy(logreg)data(PimaIndiansDiabetes2, package = "mlbench")
glimpse(PimaIndiansDiabetes2)## Rows: 768
## Columns: 9
## $ pregnant <dbl> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, 5, 7, 0, 7, 1, 1…
## $ glucose <dbl> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125, 110, 168, 139,…
## $ pressure <dbl> 72, 66, 64, 66, 40, 74, 50, NA, 70, 96, 92, 74, 80, 60, 72, N…
## $ triceps <dbl> 35, 29, NA, 23, 35, NA, 32, NA, 45, NA, NA, NA, NA, 23, 19, N…
## $ insulin <dbl> NA, NA, NA, 94, 168, NA, 88, NA, 543, NA, NA, NA, NA, 846, 17…
## $ mass <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.3, 30.5, NA, 37.…
## $ pedigree <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.248, 0.134, 0.158…
## $ age <dbl> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 34, 57, 59, 51, 3…
## $ diabetes <fct> pos, neg, pos, neg, pos, neg, pos, neg, pos, pos, neg, pos, n…
summary(PimaIndiansDiabetes2)## pregnant glucose pressure triceps
## Min. : 0.000 Min. : 44.0 Min. : 24.00 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.00 1st Qu.:22.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :29.00
## Mean : 3.845 Mean :121.7 Mean : 72.41 Mean :29.15
## 3rd Qu.: 6.000 3rd Qu.:141.0 3rd Qu.: 80.00 3rd Qu.:36.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## NA's :5 NA's :35 NA's :227
## insulin mass pedigree age diabetes
## Min. : 14.00 Min. :18.20 Min. :0.0780 Min. :21.00 neg:500
## 1st Qu.: 76.25 1st Qu.:27.50 1st Qu.:0.2437 1st Qu.:24.00 pos:268
## Median :125.00 Median :32.30 Median :0.3725 Median :29.00
## Mean :155.55 Mean :32.46 Mean :0.4719 Mean :33.24
## 3rd Qu.:190.00 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.00 Max. :67.10 Max. :2.4200 Max. :81.00
## NA's :374 NA's :11
Very simplistic implementation but it illustrates the logic.
# Assign every complete-case row to one of 10 CV folds, reproducibly.
# (1:n()) %% 10 + 1 produces near-equal counts of the labels 1..10;
# sample(x, replace = FALSE) on the full vector just permutes those labels
# so fold membership is random but fold sizes stay balanced.
set.seed(42)
pima <- na.exclude(PimaIndiansDiabetes2) %>%
mutate(cv_fold = sample(1:n() %% 10 + 1, replace = FALSE))
table(pima$cv_fold) # fairly equal distribution##
## 1 2 3 4 5 6 7 8 9 10
## 39 40 40 39 39 39 39 39 39 39
# Manual 10-fold cross-validation of the misclassification rate.
# Preallocate the result vector instead of growing it with c() inside the
# loop (growing by append copies the vector on every iteration).
folds <- unique(pima$cv_fold)
err_cv <- numeric(length(folds))
for (k in seq_along(folds)) {
  # Train on all folds except the k-th, validate on the k-th
  train <- filter(pima, cv_fold != folds[k])
  mod <- glm(diabetes ~ age + mass + insulin + pregnant, data = train, family = binomial)
  val <- filter(pima, cv_fold == folds[k])
  y_pred <- predict(mod, newdata = val, type = "response")
  # Factor levels are c("neg", "pos"), so as.numeric() gives 1/2; shift to 0/1
  y_true <- as.numeric(val$diabetes) - 1
  # A prediction is wrong when the predicted probability falls on the wrong
  # side of 0.5 relative to the observed outcome
  err_cv[k] <- mean(abs(y_true - y_pred) > 0.5)
}
err_cv## [1] 0.3333333 0.3250000 0.2564103 0.2051282 0.4358974 0.3076923 0.3000000
## [8] 0.1538462 0.2051282 0.2307692
mean(err_cv)## [1] 0.2753205
The approach you generally want to take (no need to re-invent the wheel).
# Cost function for cv.glm: the misclassification rate of predicted
# probabilities against 0/1 outcomes. A prediction counts as wrong when it
# is more than 0.5 away from the observed outcome (i.e. on the wrong side
# of the 50% threshold).
binary_pred_cost <- function(y_true, y_pred) {
  misclassified <- abs(y_true - y_pred) > 0.5
  mean(misclassified)
}
# Fit normal model
pima_glm <- glm(diabetes ~ age + mass + insulin + pregnant, data = pima, family = binomial)
# Use model object for cross-validation
pima_loo <- cv.glm(pima, pima_glm, cost = binary_pred_cost)
pima_loo$delta # 1st is raw estimate, 2nd is bias-corrected## [1] 0.2857143 0.2844908
pima_cv1 <- cv.glm(pima, pima_glm, cost = binary_pred_cost, K = 10)
pima_cv1$delta # 1st is raw estimate, 2nd is bias-corrected## [1] 0.2806122 0.2824149
The modelr-package has some powerful functionalities for
CV, LOOCV and bootstrapping. It’s more involved but may be worthwhile in
real life.
Look at the data (in Danish: vi skal tegne, før vi må regne)
data(biopsy)
summary(biopsy) # NA's in V6; mean varies across variables but anyway somewhere around 2 and 4## ID V1 V2 V3
## Length:699 Min. : 1.000 Min. : 1.000 Min. : 1.000
## Class :character 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 1.000
## Mode :character Median : 4.000 Median : 1.000 Median : 1.000
## Mean : 4.418 Mean : 3.134 Mean : 3.207
## 3rd Qu.: 6.000 3rd Qu.: 5.000 3rd Qu.: 5.000
## Max. :10.000 Max. :10.000 Max. :10.000
##
## V4 V5 V6 V7
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 1.000 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 2.000
## Median : 1.000 Median : 2.000 Median : 1.000 Median : 3.000
## Mean : 2.807 Mean : 3.216 Mean : 3.545 Mean : 3.438
## 3rd Qu.: 4.000 3rd Qu.: 4.000 3rd Qu.: 6.000 3rd Qu.: 5.000
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000
## NA's :16
## V8 V9 class
## Min. : 1.000 Min. : 1.000 benign :458
## 1st Qu.: 1.000 1st Qu.: 1.000 malignant:241
## Median : 1.000 Median : 1.000
## Mean : 2.867 Mean : 1.589
## 3rd Qu.: 4.000 3rd Qu.: 1.000
## Max. :10.000 Max. :10.000
##
biopsy_complete <- na.exclude(biopsy) # remove rows with any missing value
# Predictor matrix only: drop the ID and outcome columns, then standardise
# each column to mean 0 / sd 1
biopsy_predictors <- biopsy_complete %>%
select(-ID, -class) %>%
scale() # note attributes "remember" normalisation factors; useful for transforming test set
bind_rows(gather(as_tibble(biopsy_predictors), var, value) %>%
mutate(scale = "normalised"),
gather(select(biopsy_complete, -ID, -class), var, value) %>%
mutate(scale = "original")) %>%
ggplot(aes(x = value, colour = scale)) +
geom_density(position = "identity") +
scale_x_continuous(breaks = -2:10) +
facet_wrap(~ var, scales = "free_y") +
theme(axis.text.y = element_blank())Fit model
lasso_logreg <- glmnet(biopsy_predictors, biopsy_complete$class, family = "binomial")Coefficient profile plot (built-in: ugly but easy). Below I’ve made a custom fit function using ggplot2 in case anyone’s interested
plot(lasso_logreg, xvar = "lambda", label = TRUE, lwd = 1.5)And let’s take a look at the non-zero coefficients (so the ones
selected by the lasso regression). I’ve written a custom function
(pretty_coefs, see top of document) that returns these
coefficients in a nice format. In general, if you find yourself doing
the same (or almost the same) thing more than once, it’s usually a good
idea to pack that thing into a function. This gives shorter
code, which is easier to read, maintain and debug.
pretty_coefs(coef(lasso_logreg, s = exp(-2.5)))Alternative way to find the non-zero coefficients (this is basically
what I’ve packed into the pretty_coefs function.)
lasso_logreg_coefs <- coef(lasso_logreg, s = exp(-1.5))
lasso_logreg_coefs[which(lasso_logreg_coefs != 0), 1]## (Intercept) V2 V3 V6
## -0.6857851 0.2759895 0.2072426 0.4141450
# ridge_logreg <- update(lasso_logreg, alpha = 0)
ridge_logreg <- glmnet(biopsy_predictors, biopsy_complete$class, family = "binomial",
alpha = 0)
plot(ridge_logreg, xvar = "lambda")pretty_coefs(coef(ridge_logreg, s = exp(2))) # all shrunk but no real selectionelastic_logreg <- update(lasso_logreg, alpha = 0.5)
plot(elastic_logreg, xvar = "lambda")pretty_coefs(coef(elastic_logreg, s = exp(-1.2))) # all shrunk AND "ranking" (different coefficient sizes)ldply(list(lasso = lasso_logreg, ridge = ridge_logreg, elastic = elastic_logreg),
function(.) data.frame(as.matrix(t(.$beta)), x = apply(abs(.$beta), 2, sum)),
.id = "mod") %>%
gather(predictor, value, -x, -mod) %>%
mutate(mod = factor(mod, levels = c("lasso", "elastic", "ridge"),
labels = c("Lasso", "Elastic net (alpha = 0.5)", "Ridge"))) %>%
ggplot(aes(x, value, colour = predictor)) +
geom_line() +
labs(x = "l1 norm of coefficients", y = "Coefficient value") +
facet_wrap(~ mod, scales = "free_x")It’s a little data set but let’s try and do a 80%/20% split into training and test sets.
set.seed(42) # reproducible stochastic code
# Logical mask: each row goes to training with probability 0.8, so the split
# is ~80/20 but not exactly (rows are assigned independently)
train_idx <- runif(nrow(biopsy_complete)) <= 0.8 # not exactly indices
biopsy_train <- filter(biopsy_complete, train_idx)
biopsy_test <- filter(biopsy_complete, !train_idx)
all_equal(biopsy_complete, bind_rows(biopsy_train, biopsy_test)) # sanity check## [1] TRUE
# Alternative with actual indices (mostly a matter of taste)
set.seed(42)
train_idx <- which(runif(nrow(biopsy_complete)) <= 0.8)
biopsy_train <- slice(biopsy_complete, train_idx)
# slice() with negative indices drops the training rows, keeping the rest
biopsy_test <- slice(biopsy_complete, -train_idx)
all_equal(biopsy_complete, bind_rows(biopsy_train, biopsy_test)) # sanity check## [1] TRUE
# Normalise predictors and put them in matrix format
predictors_train <- biopsy_train %>%
select(-ID, -class) %>%
scale()
# Use normalisation factors from training predictors to scale the test predictors
# (never normalise the test set with its own statistics -- that would leak
# information from the test set into the preprocessing)
predictors_test <- biopsy_test %>%
select(-ID, -class) %>%
scale(center = attr(predictors_train, "scaled:center"),
scale = attr(predictors_train, "scaled:scale"))
# Train model
lasso_biopsy_train <- glmnet(predictors_train, biopsy_train$class, family = "binomial")
# Prediction error in train and test sets. predict(type = "class") returns
# one column of class labels per lambda value on the fitted path, so
# apply() over columns yields a misclassification rate per lambda
D_train <- predict(lasso_biopsy_train, predictors_train, type = "class") %>%
apply(2, function(.) mean(. != biopsy_train$class))
D_test <- predict(lasso_biopsy_train, predictors_test, type = "class") %>%
apply(2, function(.) mean(. != biopsy_test$class))
tibble(D_test = D_test[which.min(D_train)],
D_train = D_train[which.min(D_train)],
D_diff_abs = scales::percent(D_test - D_train),
D_diff_rel = scales::percent((D_test - D_train) / D_train, big.mark = ","))ggplot() + # could define the (common) x-axis variable already here, but simpler to do it for each geom
coord_cartesian(ylim = c(0, NA)) +
geom_line(aes(x = log(lasso_biopsy_train$lambda), y = D_train, colour = "Within-sample")) +
geom_line(aes(x = log(lasso_biopsy_train$lambda), y = D_test, colour = "Hold-out")) +
labs(y = "log(prediction error)", x = expression(log(lambda))) +
scale_x_reverse() The purpose of selective inference is to try and obtain reliable
confidence intervals (based on correct p values) for associations that
have already been selected using lasso regression. The coef
function requires more arguments than normally because it needs to
interpolate between the grid values of \(\lambda\) for which it was
trained. Also note that you need to divide the lambda
values by the number of observations (see Examples in
?fixedLassoInf).
plot(lasso_logreg, xvar = "lambda")lambda <- exp(-2)
beta <- coef(lasso_logreg, x = biopsy_predictors, y = biopsy_complete$class,
s = lambda/nrow(biopsy_complete), exact = TRUE)
delasso_fit <- fixedLassoInf(biopsy_predictors, as.numeric(biopsy_complete$class) - 1, beta,
lambda, "binomial", alpha = 0.05)
delasso_fit##
## Call:
## fixedLassoInf(x = biopsy_predictors, y = as.numeric(biopsy_complete$class) -
## 1, beta = beta, lambda = lambda, family = "binomial", alpha = 0.05)
##
## Testing results at lambda = 0.135, with alpha = 0.050
##
## Var Coef Z-score P-value LowConfPt UpConfPt LowTailArea UpTailArea
## 1 1.507 3.821 0.000 0.720 2.282 0.024 0.025
## 3 0.951 1.844 0.065 -0.331 2.067 0.025 0.024
## 4 0.945 2.746 0.006 0.221 1.625 0.024 0.025
## 5 0.214 0.621 0.536 -1.897 0.859 0.025 0.024
## 6 1.396 4.119 0.000 0.723 2.064 0.024 0.024
## 7 1.093 2.656 0.008 0.223 1.902 0.025 0.025
## 8 0.649 1.918 0.056 -0.183 1.336 0.024 0.024
## 9 0.924 1.654 0.105 -0.610 2.024 0.025 0.024
##
## Note: coefficients shown are full regression coefficients
Use cross-validation to find the best \(\lambda\) value. We found quite clear over-fitting above. Let’s try to remedy this with cross-validation. We see that we need a fairly high penalty before we see any real predictor selection, and that hurts the out-of-sample prediction performance.
# Use the training set only to fit the CV model
lasso_logreg_cv <- cv.glmnet(predictors_train, biopsy_train$class, family = "binomial", nfolds = 10)
plot(lasso_logreg_cv) # note that the y axis does NOT start at 0, which can be misleadingwith(lasso_logreg_cv, data.frame(lambda.min, lambda.1se))# With ggplot2 (more control and prettier) -- there are two helper functions to tune the appearance of
# the labels showing the number of predictors (you can choose which to use in the geom_text line)
# Plot-label helper: returns an alpha value per element, fading (alpha)
# any label identical to the one immediately before it so that only
# changes in the label sequence stand out at full opacity (1).
fade_text <- function(x, alpha = 0.2) {
  labels <- paste(x)
  is_repeat <- labels == lag(labels, default = "")
  ifelse(is_repeat, alpha, 1)
}
# Plot-label helper: logical vector marking every n-th element of x
# (positions n, 2n, 3n, ...), e.g. to thin out axis labels.
every_n <- function(x, n = 5) {
  positions <- seq_along(x)
  positions %% n == 0
}
with(lasso_logreg_cv, tibble(lambda, cvm, cvup, cvlo, nzero)) %>%
ggplot(aes(x = log(lambda))) +
geom_vline(xintercept = log(lasso_logreg_cv$lambda.min), linetype = 2, size = 0.5, colour = "red") +
geom_vline(xintercept = log(lasso_logreg_cv$lambda.1se), linetype = 2, size = 0.5, colour = "dodgerblue") +
geom_linerange(aes(ymin = cvlo, ymax = cvup), size = 0.3) +
geom_ribbon(aes(ymin = cvlo, ymax = cvup), alpha = 0.1) +
geom_point(aes(y = cvm), shape = 18, size = 1.5) +
geom_text(aes(y = max(cvup) * 1.05, label = nzero, alpha = fade_text(nzero)), size = 8 / ggplot2::.pt, show.legend = FALSE) +
scale_alpha_identity() +
coord_cartesian(ylim = c(0, NA)) + # force y axis start at 0 and end where the data do
labs(x = expression(log(lambda)), y = lasso_logreg_cv$name) +
theme_minimal()train_pred_min <- predict(lasso_logreg_cv, predictors_train, s = "lambda.min", type = "class")
train_pred_1se <- predict(lasso_logreg_cv, predictors_train, s = "lambda.1se", type = "class")
test_pred_min <- predict(lasso_logreg_cv, predictors_test, s = "lambda.min", type = "class")
test_pred_1se <- predict(lasso_logreg_cv, predictors_test, s = "lambda.1se", type = "class")
data.frame(train_min = mean(biopsy_train$class == train_pred_min),
test_min = mean(biopsy_test$class == test_pred_min),
train_1se = mean(biopsy_train$class == train_pred_1se),
test_1se = mean(biopsy_test$class == test_pred_1se))alpha_lambda_cv_res <- expand.grid(lambda = exp(seq(-8, -1, length.out = 100)), # pretty much what's used in lasso_logreg_cv$lambda
alpha = seq(0, 1, 0.1)) %>%
dlply("alpha", function(d) cv.glmnet(predictors_train, biopsy_train$class, family = "binomial", nfolds = 10, lambda = d$lambda_seq)) %>%
llply(function(fit) mutate(tidy(fit),
lambda_level = case_when(lambda == fit$lambda.min ~ "min", lambda == fit$lambda.1se ~ "1se"),
pred_err_mean = fit$cvm,
pred_err_up = fit$cvup,
pred_err_lo = fit$cvlo)) %>%
bind_rows(.id = "alpha")
# Overlain plot of the best and "second best" (within 1 standard error of the best) predictions
ggplot(alpha_lambda_cv_res, aes(x = log(lambda), y = pred_err_mean, colour = alpha)) +
coord_cartesian(xlim = c(-6.5, -3), ylim = c(0.15, 0.35)) +
geom_line(alpha = 0.5) +
geom_point(aes(shape = lambda_level), ~ filter(., !is.na(lambda_level)), size = 2) +
labs(x = expression(log(lambda)), y = "Binomial deviance")# And the best combination of alpha and lambda
alpha_lambda_cv_res %>%
slice(which.min(pred_err_mean))library(MASS)
data(biopsy)
biopsy_complete <- na.exclude(biopsy)
summary(biopsy_complete)## ID V1 V2 V3
## Length:683 Min. : 1.000 Min. : 1.000 Min. : 1.000
## Class :character 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 1.000
## Mode :character Median : 4.000 Median : 1.000 Median : 1.000
## Mean : 4.442 Mean : 3.151 Mean : 3.215
## 3rd Qu.: 6.000 3rd Qu.: 5.000 3rd Qu.: 5.000
## Max. :10.000 Max. :10.000 Max. :10.000
## V4 V5 V6 V7
## Min. : 1.00 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 1.00 1st Qu.: 2.000 1st Qu.: 1.000 1st Qu.: 2.000
## Median : 1.00 Median : 2.000 Median : 1.000 Median : 3.000
## Mean : 2.83 Mean : 3.234 Mean : 3.545 Mean : 3.445
## 3rd Qu.: 4.00 3rd Qu.: 4.000 3rd Qu.: 6.000 3rd Qu.: 5.000
## Max. :10.00 Max. :10.000 Max. :10.000 Max. :10.000
## V8 V9 class
## Min. : 1.00 Min. : 1.000 benign :444
## 1st Qu.: 1.00 1st Qu.: 1.000 malignant:239
## Median : 1.00 Median : 1.000
## Mean : 2.87 Mean : 1.603
## 3rd Qu.: 4.00 3rd Qu.: 1.000
## Max. :10.00 Max. :10.000
# Predictor matrix without the ID and outcome columns
predictors <- biopsy_complete %>%
select(-ID, -class)
# PCA on standardised predictors; keep only the first four principal
# components as predictors in a logistic regression on the tumour class
pca_fit <- prcomp(predictors, scale = TRUE)
df_pca <- data.frame(pca_fit$x[, 1:4], outcome = biopsy_complete$class)
glm_fit <- glm(outcome ~ PC1 + PC2 + PC3 + PC4, data = df_pca, family = binomial)
summary(glm_fit)##
## Call:
## glm(formula = outcome ~ PC1 + PC2 + PC3 + PC4, family = binomial,
## data = df_pca)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1791 -0.1304 -0.0619 0.0228 2.4799
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.0739 0.3035 -3.539 0.000402 ***
## PC1 -2.4140 0.2556 -9.445 < 2e-16 ***
## PC2 -0.1592 0.5050 -0.315 0.752540
## PC3 0.7191 0.3273 2.197 0.028032 *
## PC4 -0.9151 0.3691 -2.479 0.013159 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 884.35 on 682 degrees of freedom
## Residual deviance: 106.12 on 678 degrees of freedom
## AIC: 116.12
##
## Number of Fisher Scoring iterations: 8
tidy(glm_fit)glm_fit_loocv <- cv.glm(df_pca, glm_fit)
glm_fit_loocv$delta## [1] 0.02301890 0.02301788
glm_fit_loocv2 <- cv.glm(df_pca, glm_fit, cost = function(r, pi = 0) mean(abs(r-pi) > 0.5))
glm_fit_loocv2$delta## [1] 0.02781845 0.02785918
We make a plot to talk about how a prediction yields different costs depending on the cost function.
expand.grid(y_obs = 0:1,
y_pred = 0:100 / 100) %>%
mutate(cost_squared_error = (y_pred - y_obs)^2,
cost_binary = (abs(y_pred - y_obs) > 0.5) * 1,
cost_absolute_error = abs(y_pred - y_obs)) %>%
pivot_longer(starts_with("cost_"), names_to = "cost_fun", values_to = "cost") %>%
ggplot(aes(y_pred, cost, colour = cost_fun)) +
geom_line() +
facet_wrap(~ y_obs)# The error rates change quite a bit, which makes sense because 10-fold CV has a lot fewer folds than LOO which in this case has 683 folds.
glm_fit_10cv <- update(glm_fit_loocv, K = 10)
glm_fit_10cv$delta## [1] 0.02255579 0.02250644
# glm_fit_10cv2 <- cv.glm(df_pca, glm_fit, cost = function(r, pi = 0) mean(abs(r-pi) > 0.5), K = 10)
glm_fit_10cv2 <- update(glm_fit_loocv2, K = 10)
glm_fit_10cv2$delta## [1] 0.02635432 0.02679163
ldply(list("LOO" = glm_fit_loocv, "LOO 2" = glm_fit_loocv2, "10-fold" = glm_fit_10cv, "10-fold 2" = glm_fit_10cv2),
with, delta) %>%
setNames(c("model", "error_rate", "corrected_error_rate"))# knitr::opts_chunk$set(include = FALSE)load(url("https://www.biostatistics.dk/teaching/advtopicsA/data/lassodata.rda"))
# Remove columns with zero variation (because they all have the same values)
genotype <- genotype[, apply(genotype, 2, var) > 0]
# De-duplicate genotype matrix (= make sure all columns are unique, avoid perfect collinearity between columns)
genotype <- unique(genotype, MARGIN = 2)
# Normalise the matrix to put all predictors on similar scales
genotype <- scale(genotype)
# NOTE(review): no family given, so glmnet defaults to a gaussian (linear)
# model -- this fit is later called "incorrect" and redone with
# family = "binomial" (lasso_correct) further down
lasso <- glmnet(genotype, phenotype)
plot(lasso, xvar = "lambda")If predictors not on the same scale, those with large values will be discarded unduly from the model because they contribute a lot to the l1 norm.
lasso_cv <- cv.glmnet(genotype, phenotype, nfolds = 10)
plot(lasso_cv)pretty_coefs(coef(lasso, s = lasso_cv$lambda.min)) lasso_correct <- glmnet(genotype, phenotype, family = "binomial")
lasso_cv_correct <- cv.glmnet(genotype, phenotype, family = "binomial", nfolds = 10)
plot(lasso_cv_correct)pretty_coefs(coef(lasso_correct, s = lasso_cv_correct$lambda.min))Quite some difference between the sets of predictors kept:
full_join(pretty_coefs(coef(lasso_correct, s = lasso_cv_correct$lambda.min)),
pretty_coefs(coef(lasso, s = lasso_cv$lambda.min)), by = "predictor") %>%
setNames(c("predictor", "coefficient_correct", "coefficient_incorrect"))ridge_correct <- update(lasso_correct, alpha = 0)
plot(ridge_correct, xvar = "lambda")ridge_cv_correct <- update(lasso_cv_correct, alpha = 0)
full_join(pretty_coefs(coef(lasso_correct, s = lasso_cv_correct$lambda.min)),
pretty_coefs(coef(ridge_correct, s = ridge_cv_correct$lambda.min)), by = "predictor") %>%
setNames(c("predictor", "coefficient_lasso", "coefficient_ridge")) %>%
mutate(ridge_order = rank(abs(coefficient_ridge)),
ridge_order = max(ridge_order) - ridge_order + 1)The closer to zero the coefficients are, the less important they are to the prediction (thanks to normalisation of predictors), so one could set a threshold and only keep predictors with abs(coefficient) above that.
Ideally, one should do CV over \(\alpha\) as well (see the example in the code from the lecture).
elastic_correct <- update(lasso_correct, alpha = 0.5)
elastic_cv_correct <- update(lasso_cv_correct, alpha = 0.5)
full_join(pretty_coefs(coef(lasso_correct, s = lasso_cv_correct$lambda.min)),
pretty_coefs(coef(ridge_correct, s = ridge_cv_correct$lambda.min)), by = "predictor") %>%
full_join(pretty_coefs(coef(elastic_correct, s = elastic_cv_correct$lambda.min)), by = "predictor") %>%
setNames(c("predictor", "lasso", "ridge", "elastic"))delasso_formula <- pretty_coefs(coef(lasso_correct, s = lasso_cv_correct$lambda.min)) %>%
filter(predictor != "(Intercept)") %>%
with(predictor) %>%
paste(collapse = "+") %>%
paste("outcome ~", .) %>%
as.formula()
delasso_formula## outcome ~ V88 + V170 + V5 + V164 + V40 + V75 + V139 + V7 + V176 +
## V122 + V46 + V14
## <environment: 0x7ff52bb90260>
delasso_fit <- glm(delasso_formula, data = bind_cols(outcome = phenotype, data.frame(genotype)), family = binomial)
broom::tidy(delasso_fit)confint(delasso_fit) # DON'T USE, THESE ARE NOT CORRECT BECAUSE WE'VE SELECTED PREDICTORS## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -0.08785053 0.103052474
## V88 0.33821618 0.612647962
## V170 -1.37385115 2.142966524
## V5 0.21059442 0.425028150
## V164 0.18996878 0.399398108
## V40 -0.39099531 -0.163956447
## V75 -0.28239586 -0.010936988
## V139 -0.20876089 -0.009165485
## V7 -0.07691121 0.152738226
## V176 -1.73129609 1.785137425
## V122 -0.16416045 0.053407983
## V46 NA NA
## V14 NA NA
mean(phenotype == (predict(delasso_fit, type = "response") > 0.5))## [1] 0.664
Again, this is an active area of research so the package is not very
stable and the results should probably not be relied upon for now. Included
here to illustrate it can be done. Also, the fixedLassoInf
function returns an error due to singularity (at least as of 3 November
2020) owing to some columns being so similar that they seem identical to
the function (essentially, perfect collinearity).
# Pairwise cosine similarity between the columns of a matrix.
#
# df: matrix (or coercible object) for whose columns the cosine similarity
#     is to be computed.
# Returns a long data frame with one row per ordered (column, column) pair:
# target_row, target_col, and their cosine_similarity.
cosine_similarity <- function(df) {
  m <- as.matrix(df)
  # Gram matrix divided by the outer product of the column norms
  squared_norms <- colSums(m^2)
  sim <- t(m) %*% m / sqrt(squared_norms %*% t(squared_norms))
  sim %>%
    as.data.frame() %>%
    rownames_to_column("target_row") %>%
    pivot_longer(-target_row, names_to = "target_col", values_to = "cosine_similarity")
}
cosine_similarity(genotype) %>%
arrange(desc(cosine_similarity)) lambda <- lasso_cv_correct$lambda.min
beta <- coef(lasso_correct, x = genotype, y = phenotype, exact = TRUE, s = lambda/nrow(genotype))
res <- fixedLassoInf(genotype, phenotype, beta, lambda, family = "binomial", alpha = 0.05, tol.beta = .Machine$double.eps)